WebDBInjector.java example

Explorer

damp.ekeko.snippets-master
- damp.ekeko.snippets.plugin
  - src
    - damp
      - ekeko
        snippets
        BoundDirective.java
        DirectiveOperandBinding.java
        EkekoSnippetsPlugin.java
        ExtractedSnippet.java
        NaiveASTFlattener.java
        OperatorOperandBinding.java
        SnippetBaseListener.java
        SnippetBaseVisitor.java
        SnippetExtractor.java
        SnippetLexer.java
        SnippetListener.java
        SnippetParser.java
        SnippetVisitor.java
        data
        SnippetOperator.java
        TemplateGroup.java
        geneticsearch
        PartialJavaProjectModel.java
        gui
        BoundDirectivesEditorDialog.java
        BoundDirectivesViewer.java
        ChartCanvas.java
        ClojureFileEditorInput.java
        DirectiveOperandBindingEditingSupport.java
        DirectiveOperandBindingLabelProviderValue.java
        DirectiveSelectionDialog.java
        IntendedResultsEditor.java
        IntendedResultsEditorCommandHandler.java
        IntendedResultsEditorInput.java
        IntendedResultsEditorPersistableElementFactory.java
        MutationHistoryDialog.java
        OperandBindingLabelProviderDescription.java
        OperatorOperandBindingEditingSupport.java
        OperatorOperandBindingLabelProviderValue.java
        OperatorOperandsView.java
        OperatorOperandsViewer.java
        OperatorTreeContentProvider.java
        OperatorTreeLabelProvider.java
        PopulationInspectorDialog.java
        QueryInspectorDialog.java
        RecommendationEditor.java
        RecommendationEditorCommandHandler.java
        RecommendationEditorInput.java
        RecommendationEditorPersistableElementFactory.java
        RewritesTemplateEditor.java
        SubjectsTemplateEditor.java
        TemplateCodeGenerator.java
        TemplateEditor.java
        TemplateEditorActionBarContributor.java
        TemplateEditorCommandHandler.java
        TemplateEditorInput.java
        TemplateEditorPersistableElementFactory.java
        TemplateGroupNodeSelectionDialog.java
        TemplateGroupTemplateElement.java
        TemplateGroupViewer.java
        TemplateGroupViewerNodeDoubleClickListener.java
        TemplateGroupViewerNodeSelectionEvent.java
        TemplateGroupViewerNodeSelectionListener.java
        TemplatePrettyPrinter.java
        TemplateTreeContentProvider.java
        TemplateTreeLabelProviders.java
        TransformationEditor.java
        TransformationEditorActionBarContributor.java
        TransformationEditorCommandHandler.java
        TransformationEditorInput.java
        TransformationEditorPersistableElementFactory.java
        TransformationOverviewEditor.java
    - ec
      - util
        MersenneTwister.java
- damp.ekeko.snippets.plugin.test
  - resources
  - src
    - test
      - damp
        ekeko
        snippets
        EkekoSnippetsTest.java
        experiments
        GeneticSearchTest.java

/* Copyright (c) 2003 The Nutch Organization.  All rights reserved.   */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */

package net.nutch.db;

import java.io.*;
import java.net.*;
import java.util.*;
import java.util.logging.*;
import java.net.MalformedURLException;

import javax.xml.parsers.*;
import org.xml.sax.*;
import org.xml.sax.helpers.*;
import org.apache.xerces.util.XMLChar;

import net.nutch.io.*;
import net.nutch.net.*;
import net.nutch.util.*;
import net.nutch.pagedb.*;
import net.nutch.linkdb.*;
import net.nutch.util.NutchConf;

/*********************************************
 * This class adds URLs as page entries to a
 * web db.  The URLs come either from a flat
 * file (one URL per line) or from a structured
 * DMOZ RDF file.  Useful for bootstrapping the
 * system.
 *
 * @author Mike Cafarella
 * @author Doug Cutting
 *********************************************/
public class WebDBInjector {
    // URL used as the name of the single synthetic "Dmoz" source page.
    // RDFProcessor registers a Page under this name and uses it as the
    // origin of every descriptive link it adds.
    private static final String DMOZ_PAGENAME = "http://www.dmoz.org/";

    // Value of the "db.default.fetch.interval" setting, narrowed to a byte
    // (30 is passed as the second, presumably default, argument).
    // Not referenced anywhere else in this class.
    // NOTE(review): the unit is presumably days -- confirm against the
    // configuration documentation.
    private static final byte DEFAULT_INTERVAL =
      (byte)NutchConf.getInt("db.default.fetch.interval", 30);

    // Score given to every Page created by addPage(); taken from the
    // "db.score.injected" setting (2.0f is passed as the second,
    // presumably default, argument).
    private static final float NEW_INJECTED_PAGE_SCORE =
      NutchConf.getFloat("db.score.injected", 2.0f);

    // Logger used throughout this class.
    public static final Logger LOG = LogFormatter.getLogger("net.nutch.db.WebDBInjector");

    /**
     * Reader filter that repairs characters which might offend the XML
     * parser, so that errors in the input do not abort the whole parse.
     *
     * <p>Two repairs are made, each by substituting the character 'X':
     * <ul>
     * <li>any character rejected by <code>XMLChar.isValid</code>;</li>
     * <li>a '&lt;' that directly follows the character U+FFFD (65533) and
     *     does not start a closing tag ("&lt;/").</li>
     * </ul>
     * Characters are only ever replaced one-for-one, so the number of
     * characters delivered equals the number read from the wrapped reader.
     *
     * <p>The second repair needs one character of lookahead.  When that
     * character is not already in hand it is obtained with mark/reset on
     * the wrapped reader; if the wrapped reader does not support marking,
     * the '&lt;' is passed through unchanged.
     */
    private static class XMLCharFilter extends FilterReader {
      // Written in place of every repaired character.
      private static final char REPLACEMENT = 'X';

      // U+FFFD; a '<' right after this character is treated as suspect.
      private static final int BAD_CHAR = 65533;

      // True when the previous character read (by either read method)
      // was BAD_CHAR.
      private boolean lastBad = false;

      public XMLCharFilter(Reader reader) {
        super(reader);
      }

      /**
       * Reads one character, applying the repairs described on the class.
       *
       * @return the (possibly replaced) character, or -1 at end of stream
       */
      public int read() throws IOException {
        int c = in.read();
        int value = c;
        if (c != -1 && !XMLChar.isValid(c)) {      // fix invalid characters
          value = REPLACEMENT;
        } else if (lastBad && c == '<' && nextIsNotSlash()) {
          value = REPLACEMENT;                     // fix mis-matched brackets
        }
        lastBad = (c == BAD_CHAR);

        return value;
      }

      /**
       * Reads characters into <code>cbuf</code> and repairs them in place.
       * The decisions do not depend on where chunk boundaries fall: a
       * suspect '&lt;' in the last position of the chunk is checked against
       * the next character of the stream, exactly as read() does.
       *
       * @return the number of characters read, or -1 at end of stream
       */
      public int read(char[] cbuf, int off, int len)
        throws IOException {
        int n = in.read(cbuf, off, len);
        // n is -1 at end of stream; the loop then simply does not run.
        for (int i = 0; i < n; i++) {
          char c = cbuf[off + i];
          char value = c;
          if (!XMLChar.isValid(c)) {               // fix invalid characters
            value = REPLACEMENT;
          } else if (lastBad && c == '<') {        // fix mis-matched brackets
            boolean stray;
            if (i < n - 1) {
              // The following character is still unfiltered at this point.
              stray = cbuf[off + i + 1] != '/';
            } else {
              // Last character of this chunk: look ahead in the stream.
              stray = nextIsNotSlash();
            }
            if (stray) {
              value = REPLACEMENT;
            }
          }
          lastBad = (c == BAD_CHAR);
          cbuf[off + i] = value;
        }
        return n;
      }

      /**
       * Peeks at the next character of the wrapped reader without
       * consuming it.
       *
       * @return true if the next character is known not to be '/'
       *         (end of stream counts as "not '/'"); false if it is '/'
       *         or if the wrapped reader cannot be marked
       */
      private boolean nextIsNotSlash() throws IOException {
        if (!in.markSupported()) {
          return false;   // no lookahead possible: leave the '<' alone
        }
        in.mark(1);
        int next = in.read();
        in.reset();
        return next != '/';
      }
    }


    /**
     * SAX handler that receives the element events of a DMOZ RDF file.
     * Every accepted <code>ExternalPage</code> entry is added to the web db
     * through the enclosing injector, optionally together with a
     * descriptive link that originates at the synthetic Dmoz page.
     *
     * <p>This is an inner (non-static) class because it writes through the
     * enclosing injector's <code>dbWriter</code>, reads its
     * <code>nextFetch</code> and updates its <code>pages</code> counter.
     */
    class RDFProcessor extends DefaultHandler {
        // curURL: URL of the ExternalPage currently being read, or null when
        // no accepted ExternalPage is open.
        // curSection: "r:id" of the most recent Topic element (may be null).
        String curURL = null, curSection = null;
        // titlePending/descPending: true while inside d:Title/d:Description
        // of an accepted page.  insideAdultSection is not read anywhere in
        // this class.
        boolean titlePending = false, descPending = false, insideAdultSection = false;
        // Text collected for the current page's title and description.
        StringBuffer title = new StringBuffer(), desc = new StringBuffer();
        // Stored by the constructor; not otherwise used in this class.
        XMLReader reader;
        // A page is kept when its skewed hash is a multiple of subsetDenom.
        int subsetDenom;
        // Value XOR-ed into each URL hash before the subset test.
        int hashSkew;
        boolean includeAdult, includeDmozDesc;
        // Identity of the synthetic Dmoz page that descriptive links start at.
        MD5Hash srcDmozID;
        long srcDmozDomainID;
        // Last locator supplied by the parser; null until it supplies one.
        Locator location;

        /**
         * Creates the handler and registers the synthetic Dmoz source page
         * with the db writer.
         *
         * @param reader          the XMLReader this handler is attached to
         * @param subsetDenom     keep roughly one page in this many;
         *                        must not be 0
         * @param includeAdult    whether pages in sections starting with
         *                        "Top/Adult" are kept
         * @param includeDmozDesc whether a descriptive link (title plus
         *                        description) is added for each page
         * @param skew            value mixed into the subset hash; 0 means
         *                        "pick a random one"
         * @throws IllegalArgumentException if subsetDenom is 0
         * @throws IOException if the Dmoz page cannot be created or added
         */
        public RDFProcessor(XMLReader reader, int subsetDenom, boolean includeAdult, boolean includeDmozDesc, int skew) throws IOException {
            // A zero denominator would otherwise surface as an
            // ArithmeticException in the middle of the parse.
            if (subsetDenom == 0) {
                throw new IllegalArgumentException("subsetDenom must not be 0");
            }
            this.reader = reader;
            this.subsetDenom = subsetDenom;
            this.includeAdult = includeAdult;
            this.includeDmozDesc = includeDmozDesc;

            // We create a Page entry for the "Dmoz" page, from
            // which all descriptive links originate.  The name
            // of this page is always the same, stored in
            // DMOZ_PAGENAME.  The MD5 is generated over that name
            // plus the injector's nextFetch value.  Until this page
            // is deleted, the descriptive links will always be kept.
            //
            // If the DMOZ page is updated with new content, you
            // *could* update these links, if you really wanted to.
            // Just run inject again!  This will replace the old
            // Dmoz Page, because we always keep the same name.
            // That obsolete Page will be deleted, and all its
            // outlinks (the descriptive ones) garbage-collected.
            //
            // Then we just proceed to add the new descriptive
            // links, with the brand-new page's src MD5.
            //
            this.srcDmozID = MD5Hash.digest(DMOZ_PAGENAME + "_" + nextFetch);
            Page dmozPage = new Page(DMOZ_PAGENAME, srcDmozID);
            dmozPage.setNextFetchTime(Long.MAX_VALUE);
            dbWriter.addPageIfNotPresent(dmozPage);

            this.srcDmozDomainID = MD5Hash.digest(new URL(DMOZ_PAGENAME).getHost()).halfDigest();

            this.hashSkew = skew != 0 ? skew : new Random().nextInt();
        }

        //
        // Interface ContentHandler
        //

        /**
         * Start of an XML element.  Remembers the section of each Topic,
         * decides for each ExternalPage whether it is kept, and notes when
         * the title or description of a kept page begins.
         */
        public void startElement(String namespaceURI, String localName, String qName, Attributes atts) throws SAXException {
            if ("Topic".equals(qName)) {
                curSection = atts.getValue("r:id");
            } else if ("ExternalPage".equals(qName)) {
                // Adult filter.  curSection is null when no Topic has been
                // seen yet or the Topic had no r:id; such pages are not
                // treated as adult.
                if ((! includeAdult) && curSection != null
                    && curSection.startsWith("Top/Adult")) {
                    return;
                }

                // An entry without an "about" attribute has no URL to add.
                String url = atts.getValue("about");
                if (url == null) {
                    return;
                }

                // Subset denominator filter.
                // Only emit with a chance of 1/denominator.
                // (Math.abs leaves Integer.MIN_VALUE negative; that is
                // harmless because only a comparison with 0 follows.)
                int hashValue = MD5Hash.digest(url).hashCode();
                hashValue = Math.abs(hashValue ^ hashSkew);
                if ((hashValue % subsetDenom) != 0) {
                    return;
                }

                // We actually claim the URL!
                curURL = url;
            } else if (curURL != null && "d:Title".equals(qName)) {
                titlePending = true;
            } else if (curURL != null && "d:Description".equals(qName)) {
                descPending = true;
            }
        }

        /**
         * The contents of an XML element.  Appended to the title or the
         * description buffer, whichever is currently pending.
         */
        public void characters(char ch[], int start, int length) {
            if (titlePending) {
                title.append(ch, start, length);
            } else if (descPending) {
                desc.append(ch, start, length);
            }
        }

        /**
         * Termination of an XML element.  At the end of a kept ExternalPage
         * the page (and, if requested, its descriptive link) is added to
         * the db and the per-page state is cleared.
         */
        public void endElement(String namespaceURI, String localName, String qName) throws SAXException {
            if (curURL == null) {
                return;             // not inside a kept ExternalPage
            }

            if ("ExternalPage".equals(qName)) {
                try {
                    // First, manufacture the Page entry for the
                    // given DMOZ listing.
                    if (addPage(curURL)) {

                        // Second, add a link from the DMOZ page TO the
                        // just-added target Page.  The anchor text is
                        // the merged Title and Desc from the DMOZ
                        // listing.  For testing reasons, the caller
                        // may choose to disallow this.
                        if (includeDmozDesc) {
                            String fullDesc = title + " " + desc;
                            Link descLink = new Link(srcDmozID, srcDmozDomainID, curURL, fullDesc);
                            dbWriter.addLink(descLink);
                        }
                        pages++;

                        // Report progress only when the count changed;
                        // otherwise the same dot or stats line would be
                        // repeated for every page that was not added.
                        printStatusBar(2000, 50000);
                    }
                } catch (MalformedURLException e) {
                    LOG.fine("skipping " + curURL + ":" + e);
                } catch (IOException ie) {
                    LOG.severe("problem adding url " + curURL + ": " + ie);
                }

                // Clear out the link text and the URL so the next
                // ExternalPage starts from a clean state.
                title.setLength(0);
                desc.setLength(0);
                curURL = null;
            } else if ("d:Title".equals(qName)) {
                titlePending = false;
            } else if ("d:Description".equals(qName)) {
                descPending = false;
            }
        }

        /**
         * When parsing begins
         */
        public void startDocument() {
            LOG.info("Begin parse");
        }

        /**
         * When parsing ends
         */
        public void endDocument() {
            LOG.info("Completed parse.  Added " + pages + " pages.");
        }

        /**
         * From time to time the Parser will set the "current location"
         * by calling this function.  It's useful for emitting locations
         * for error messages.
         */
        public void setDocumentLocator(Locator locator) {
            location = locator;
        }


        //
        // Interface ErrorHandler
        //

        /**
         * Emit the exception message
         */
        public void error(SAXParseException spe) {
            LOG.severe("Error: " + spe.toString() + ": " + spe.getMessage());
            spe.printStackTrace(System.out);
        }

        /**
         * Emit the exception message, with line numbers when the parser
         * has supplied a locator.
         */
        public void fatalError(SAXParseException spe) {
            LOG.severe("Fatal error: " + spe.toString() + ": " + spe.getMessage());
            // The parser is not obliged to supply a locator, so guard
            // against failing inside the error handler itself.
            if (location != null) {
                LOG.severe("Last known line is " + location.getLineNumber() + ", column " + location.getColumnNumber());
            }
            spe.printStackTrace(System.out);
        }

        /**
         * Emit exception warning message
         */
        public void warning(SAXParseException spe) {
            LOG.warning("Warning: " + spe.toString() + ": " + spe.getMessage());
            spe.printStackTrace(System.out);
        }
    }

    // Writer through which every page and link is added; also closed by
    // close().
    private IWebDBWriter dbWriter;

    /**
     * Creates an injector that adds its entries through the given writer.
     * The reference is stored as-is; no null check is made here.
     *
     * @param dbWriter the web db writer that this injector should add to
     */
    public WebDBInjector(IWebDBWriter dbWriter) {
        this.dbWriter = dbWriter;
    }

    /**
     * Closes the db writer this injector was created with.  Per the
     * original note on this method, closing the writer is what saves the
     * changes.
     *
     * @throws IOException if the writer's close() fails
     */
    public void close() throws IOException {
        dbWriter.close();
    }

    /**
     * Emits progress feedback based on the current page count: a "." on
     * standard output whenever the count is a multiple of
     * <code>small</code>, and a statistics line (via printStatus) whenever
     * it is a multiple of <code>big</code>.
     *
     * @param small page-count interval between progress dots
     * @param big   page-count interval between statistics lines
     */
    public void printStatusBar(int small, int big){
        final boolean dotDue = (pages % small) == 0;
        if (dotDue) {
            System.out.print(".");
        }

        final boolean statsDue = (pages % big) == 0;
        if (statsDue) {
            printStatus();
        }
    }

    // Time at which this injector was created; baseline for the rate
    // reported by printStatus().
    long startTime = System.currentTimeMillis();
    // Number of pages counted as added so far.
    long pages = 0;
    // Next-fetch time given to every injected page.  Each inject method
    // replaces it with the last-modified time of its input file.
    long nextFetch = System.currentTimeMillis();

    /**
     * Logs the number of pages added so far and the average rate in pages
     * per second since this injector was created.  Logs nothing while no
     * page has been added.
     */
    public void printStatus(){
        if (pages == 0) {
            return;
        }

        long elapsed = System.currentTimeMillis() - startTime;
        // Avoid dividing by zero when called within the first millisecond.
        if (elapsed < 1) {
            elapsed = 1;
        }
        LOG.info("\t" + pages + "\t" +
                 (int)((1000 * pages)/elapsed) + " pages/second\t" );
    }

    /**
     * Reads the given flat text file line by line and adds each URL to the
     * db.  Lines are trimmed; blank lines are ignored, and a URL that is
     * rejected as malformed is logged and skipped without stopping the
     * run.
     *
     * <p>Any other failure while reading or adding is logged and ends the
     * run early; it is not rethrown.  The file is always closed.
     *
     * @param urlList text file with one URL per line; its last-modified
     *                time becomes the next-fetch time of the added pages
     * @throws IOException if the file cannot be opened or closed
     */
    public void injectURLFile(File urlList) throws IOException {
        nextFetch = urlList.lastModified();
        // NOTE(review): FileReader decodes with the platform default
        // charset, whereas injectDmozFile reads UTF-8 explicitly.
        BufferedReader reader = new BufferedReader(new FileReader(urlList));
        try {
            String curStr = null;
            LOG.info("Starting URL processing");
            while ((curStr = reader.readLine()) != null) {
                String url = curStr.trim();
                if (url.length() == 0) {
                    continue;               // nothing to inject on this line
                }
                try {
                    if (addPage(url)) {
                        this.pages++;
                        // Report progress only when the count changed, so
                        // skipped lines do not repeat the same dot or
                        // stats line.
                        printStatusBar(2000,50000);
                    }
                } catch (MalformedURLException e) {
                    // One bad line must not abort the rest of the file.
                    LOG.fine("skipping " + url + ":" + e);
                }
            }
            LOG.info("Added " + pages + " pages");
        } catch (Exception e) {
          LOG.severe("error while injecting:" + e);
        } finally {
          reader.close();
        }
    }

    /**
     * Parses the given structured DMOZ file and adds each accepted URL to
     * the web db.
     *
     * <p>If parsing fails, the error is logged, the stack trace is printed
     * to standard output and the JVM is terminated with exit status 1.
     *
     * @param dmozFile        the DMOZ RDF file; its last-modified time
     *                        becomes the next-fetch time of the added pages
     * @param subsetDenom     keep roughly one page in this many
     * @param includeAdult    whether pages in "Top/Adult" sections are kept
     * @param includeDmozDesc whether descriptive links are added
     * @param skew            value mixed into the subset hash; 0 lets the
     *                        processor pick a random one
     */
    public void injectDmozFile(File dmozFile, int subsetDenom, boolean includeAdult, boolean includeDmozDesc, int skew) throws IOException, SAXException, ParserConfigurationException {
        // Must be set before the RDFProcessor is created: its constructor
        // derives the Dmoz page's MD5 from nextFetch.
        nextFetch = dmozFile.lastModified();

        // NOTE(review): the parser is used with its default configuration;
        // consider whether DTD / external-entity processing should be
        // disabled for downloaded input.
        SAXParserFactory parserFactory = SAXParserFactory.newInstance();
        SAXParser parser = parserFactory.newSAXParser();
        XMLReader reader = parser.getXMLReader();

        // Create our own processor to receive SAX events
        RDFProcessor rp =
          new RDFProcessor(reader, subsetDenom, includeAdult, includeDmozDesc, skew);
        reader.setContentHandler(rp);
        reader.setErrorHandler(rp);
        LOG.info("skew = " + rp.hashSkew);

        //
        // Open filtered text stream.  The file is decoded as UTF-8 and
        // passed through XMLCharFilter, which replaces characters the
        // parser would reject with 'X' (nothing is dropped).
        //
        XMLCharFilter in = new XMLCharFilter(new BufferedReader(new InputStreamReader(new BufferedInputStream(new FileInputStream(dmozFile)), "UTF-8")));
        try {
            InputSource is = new InputSource(in);
            reader.parse(is);
        } catch (Exception e) {
            LOG.severe(e.toString());
            e.printStackTrace(System.out);
            // Non-zero status so that calling scripts can see the failure.
            System.exit(1);
        } finally {
            in.close();
        }
    }

    /**
     * Passes the URL through the configured URL filter and, if the filter
     * returns a non-null result, hands a new Page for that result to the
     * db writer (scored NEW_INJECTED_PAGE_SCORE, with the current
     * nextFetch).
     *
     * <p>The outcome of addPageIfNotPresent is not inspected, so the
     * return value says only whether the filter accepted the URL.
     *
     * @param url the candidate URL
     * @return true if the filter accepted the URL, false if it returned null
     * @throws IOException if creating or adding the page fails
     */
    private boolean addPage(String url) throws IOException {
      String accepted = URLFilterFactory.getFilter().filter(url);
      if (accepted == null) {
        return false;
      }

      dbWriter.addPageIfNotPresent(
          new Page(accepted, NEW_INJECTED_PAGE_SCORE, nextFetch));
      return true;
    }

    /**
     * Command-line access.  User may add URLs via a flat text file
     * or the structured DMOZ file.  By default, we ignore Adult
     * material (as categorized by DMOZ).
     *
     * <p>Any argument that is not a recognised option (or the value of
     * one) is taken as the db directory.  The command line is fully
     * validated before the db writer is opened, so a bad command line
     * never leaves a writer open.
     *
     * <p>NOTE(review): close() is not called when an inject method throws;
     * presumably so that a failed run is not saved -- confirm.
     */
    public static void main(String argv[]) throws Exception {
      if (argv.length < 3) {
        printUsage();
        return;
      }

      //
      // Parse the command line, figure out what kind of
      // URL file we need to load
      //
      int subsetDenom = 1;
      int skew = 0;
      String dbDir = null, command = null, loadfile = null;
      boolean includeAdult = false, includeDmozDesc = true;

      for (int i = 0; i < argv.length; i++) {
        String arg = argv[i];
        boolean isFileOption = "-urlfile".equals(arg) || "-dmozfile".equals(arg);
        boolean takesValue =
          isFileOption || "-subset".equals(arg) || "-skew".equals(arg);

        // An option that needs a value must not be the last argument.
        if (takesValue && i + 1 >= argv.length) {
          System.out.println("Missing value for " + arg);
          printUsage();
          return;
        }

        if (isFileOption) {
          command = arg;
          loadfile = argv[++i];
        } else if ("-includeAdultMaterial".equals(arg)) {
          includeAdult = true;
        } else if ("-noDmozDesc".equals(arg)) {
          includeDmozDesc = false;
        } else if ("-subset".equals(arg)) {
          subsetDenom = Integer.parseInt(argv[++i]);
        } else if ("-skew".equals(arg)) {
          skew = Integer.parseInt(argv[++i]);
        } else {
          dbDir = arg;
        }
      }

      if (command == null) {
        System.out.println("No command indicated.");
        return;
      }
      if (dbDir == null) {
        System.out.println("No db directory indicated.");
        printUsage();
        return;
      }

      //
      // Create the webdbWriter, the injector, and then inject the
      // right kind of URL file.
      //
      IWebDBWriter writer = new WebDBWriter(new File(dbDir));
      WebDBInjector injector = new WebDBInjector(writer);
      if ("-urlfile".equals(command)) {
        injector.injectURLFile(new File(loadfile));
      } else {
        // command can only be "-dmozfile" here
        injector.injectDmozFile(new File(loadfile), subsetDenom, includeAdult, includeDmozDesc, skew);
      }

      injector.close();
    }

    /** Prints the command-line synopsis to standard output. */
    private static void printUsage() {
      System.out.println("Usage: WebDBInjector <db_dir> (-urlfile <url_file> | -dmozfile <dmoz_file>) [-subset <subsetDenominator>] [-includeAdultMaterial] [-skew skew] [-noDmozDesc]");
    }
}